Setup

library(biomaRt)
package ‘biomaRt’ was built under R version 3.6.3Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
library(DESeq2)
Loading required package: S4Vectors
package ‘S4Vectors’ was built under R version 3.6.3Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport,
    clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply,
    parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname,
    do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
    is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which, which.max, which.min


Attaching package: ‘S4Vectors’

The following object is masked from ‘package:base’:

    expand.grid

Loading required package: IRanges
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
package ‘GenomeInfoDb’ was built under R version 3.6.3Loading required package: SummarizedExperiment
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'. To cite
    Bioconductor, see 'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: DelayedArray
package ‘DelayedArray’ was built under R version 3.6.3Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following objects are masked from ‘package:Biobase’:

    anyMissing, rowMedians

Loading required package: BiocParallel

Attaching package: ‘DelayedArray’

The following objects are masked from ‘package:matrixStats’:

    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

The following objects are masked from ‘package:base’:

    aperm, apply, rowsum

Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
library(sleuth)
library(tximport)
package ‘tximport’ was built under R version 3.6.3
library(jsonlite)
library(data.table)
data.table 1.12.8 using 4 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following object is masked from ‘package:SummarizedExperiment’:

    shift

The following object is masked from ‘package:GenomicRanges’:

    shift

The following object is masked from ‘package:IRanges’:

    shift

The following objects are masked from ‘package:S4Vectors’:

    first, second
library(vsn)
library(EnhancedVolcano)
Loading required package: ggplot2
Loading required package: ggrepel
library(viridis)
Loading required package: viridisLite
library(ggpubr)
Loading required package: magrittr
library(MASS)

Attaching package: ‘MASS’

The following object is masked from ‘package:biomaRt’:

    select
library(rtracklayer)
library(VennDiagram)
Loading required package: grid
Loading required package: futile.logger

Attaching package: ‘VennDiagram’

The following object is masked from ‘package:ggpubr’:

    rotate
library(RColorBrewer)
library(ComplexHeatmap)
========================================
ComplexHeatmap version 2.2.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.
========================================
library(RColorBrewer)
library(circlize)
========================================
circlize version 0.4.8
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: http://jokergoo.github.io/circlize_book/book/

If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization 
  in R. Bioinformatics 2014.
========================================
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ tibble  3.0.1     ✓ dplyr   0.8.5
✓ tidyr   1.0.2     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
✓ purrr   0.3.4     
── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::between()    masks data.table::between()
x dplyr::collapse()   masks IRanges::collapse()
x dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
x dplyr::count()      masks matrixStats::count()
x dplyr::desc()       masks IRanges::desc()
x tidyr::expand()     masks S4Vectors::expand()
x tidyr::extract()    masks magrittr::extract()
x dplyr::filter()     masks stats::filter()
x dplyr::first()      masks data.table::first(), S4Vectors::first()
x purrr::flatten()    masks jsonlite::flatten()
x dplyr::lag()        masks stats::lag()
x dplyr::last()       masks data.table::last()
x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
x dplyr::rename()     masks S4Vectors::rename()
x dplyr::select()     masks MASS::select(), biomaRt::select()
x purrr::set_names()  masks magrittr::set_names()
x purrr::simplify()   masks DelayedArray::simplify()
x dplyr::slice()      masks IRanges::slice()
x purrr::transpose()  masks data.table::transpose()
set.seed(42)
RESPONSE_COLORS = c("NR"="purple","R"="orange")

Setup sample details

# Load in samples and point to kallisto paths
samples <- read_tsv("../work/input/sample_manifest.tsv")
Parsed with column specification:
cols(
  `Patient ID` = col_character(),
  `GSM ID` = col_character(),
  Response = col_character(),
  SRR = col_character()
)
samples$kallisto_path <- file.path("..","work","intermediate","quant",samples$SRR)

Transcript <-> Gene Mappings

mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = 'grch37.ensembl.org')
gene_features <- biomaRt::getBM(attributes = c("ensembl_transcript_id_version", "ensembl_gene_id",
    "external_gene_name","transcript_length"), mart = mart)
Cache found
t2g <- gene_features %>%
  select(ensembl_transcript_id_version,ensembl_gene_id,external_gene_name)
avg_tx_len <-gene_features %>%
  group_by(external_gene_name) %>%
  summarise(length=mean(transcript_length))

QC Filtering

We’ll filter high quality samples by percent reads mapped by Kallisto. We’ll need to get hte total reads in files from the kallisto logs, and the total number of mapped reads by pre-loading all the counts using sleuth.

# Find total reads to compute percent mapped
samples$n_reads <- apply(samples,1,function(x)read_json(file.path(x["kallisto_path"],"run_info.json"))$n_processed)
# Initial sleuth just to get kallisto counts
s_obj_counts <- sleuth_prep(
  sample_to_covariates = rename(samples,path=kallisto_path,sample=SRR),
  normalize=FALSE
)
It appears that you are running Sleuth from within Rstudio.
Because of concerns with forking processes from a GUI, 'num_cores' is being set to 1.
If you wish to take advantage of multiple cores, please consider running sleuth from the command line.reading in kallisto results
dropping unused factor levels
................
86131 targets passed the filter
kallisto_est_mapped <- s_obj_counts$obs_raw %>%
  group_by(sample) %>%
  summarise(kallisto_mapped_reads=sum(est_counts))
samples <- samples %>% 
  left_join(kallisto_est_mapped,by=c("SRR"="sample")) %>%
  mutate(percent_mapped=kallisto_mapped_reads/n_reads) 
samples %>%
  ggplot(aes(x = percent_mapped)) +
    geom_histogram(bins = 10) +
    labs(title = "Distribution of % Mapped Reads", x = "% Mapped Reads")

# Filter samples
samples_filt <-samples %>% 
  filter(
    !SRR %in% c("SRR8526726"),
    #percent_mapped > 0.8
    )

Sleuth

PCA

Sleuth default

plot_pca(s_obj,units="scaled_reads_per_base",color_by = "Response",use_filtered = TRUE,text_labels = F) + geom_label(aes(label=sample),size=2)

Standardized PCA


sleuth_rna_norm <- s_obj$obs_norm_filt %>% 
  select(sample,target_id,scaled_reads_per_base) %>%
  spread(key=target_id,value=scaled_reads_per_base) %>%
  column_to_rownames("sample")
sleuth_std_pca <- prcomp((sleuth_rna_norm),center=T,scale.=T)
sleuth_variance_explained = summary(sleuth_std_pca)$importance[2,1:2]
sleuth_std_pca$x[,1:2] %>%
  as.data.frame() %>%
  rownames_to_column("SRR") %>%
  left_join(samples,by="SRR") %>%
  ggplot(aes(x=PC1,y=PC2,color=Response,label=SRR)) +
    geom_point(size=3) +
    labs(title="PCA Plot of Sleuth Normalized Kallisto Quantified Data",
         x = paste0("PC1: ", round(sleuth_variance_explained[1]*100),"% Variance"),
         y = paste0("PC2: ", round(sleuth_variance_explained[2]*100),"% Variance")) +
    scale_color_manual(values=RESPONSE_COLORS)

Diffex

LRT

s_obj <- s_obj %>%
  sleuth_fit(~Response,"full") %>%
  sleuth_fit(~1,"reduced") %>%
  sleuth_lrt("reduced","full")
fitting measurement error models
shrinkage estimation
computing variance of betas
fitting measurement error models
shrinkage estimation
computing variance of betas
sleuth_diffex_lrt <- sleuth_results(s_obj,test="reduced:full",test_type="lrt") %>% filter(!is.na(pval))
sleuth_diffex_lrt %>% arrange(qval)

Wald

s_obj <- sleuth_wt(s_obj,"ResponseR")
sleuth_diffex_wald <- sleuth_results(s_obj,test="ResponseR",test_type="wald")  %>% filter(!is.na(pval))
sleuth_diffex_wald %>% arrange(qval) 
plot_volcano(s_obj,test_type="wt",test="ResponseR",sig_level=0.05) +
  labs(title="Volcano Plot of Sleuth Identified Differentially Expressed Genes",
       x="log2FC", y = "qval",
        color="Q < 0.05")

QC

meanSdPlot(log2(sleuth_to_matrix(s_obj,"obs_norm","scaled_reads_per_base") + 1),rank=FALSE,bins=100)

DESeq (Kallisto)

files <- file.path(samples_filt$kallisto_path,"abundance.h5")
names(files) <- samples_filt$`Patient ID`
txi <- tximport(
  file.path(samples_filt$kallisto_path,"abundance.h5"),
  type = "kallisto",
  tx2gene = select(
    t2g,
    TXNAME = ensembl_transcript_id_version,
    GENEID = external_gene_name
  ),
  ignoreAfterBar = TRUE,
)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates
rownames(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
deseq_kallisto_data <- DESeqDataSetFromTximport(txi,samples_filt,~Response)
some variables in design formula are characters, converting to factorsusing counts and average transcript lengths from tximport
kallisto_keep <- rowSums(counts(deseq_kallisto_data)>5)>5
deseq_kallisto_data <- deseq_kallisto_data[kallisto_keep,]

QC

deseq_kallisto_norm <- vst(deseq_kallisto_data)
using 'avgTxLength' from assays(dds), correcting for library size
meanSdPlot(assay(deseq_kallisto_norm),ranks=FALSE,bins=100)

PCA

plotPCA(deseq_kallisto_norm,intgroup="Response",ntop=500000)   +
  labs(title="PCA Plot of DESeq Normalized Kallisto Quantified Data",
       color="Response") +
  scale_color_manual(values=RESPONSE_COLORS)

Difex

deseq_kallisto_data <- DESeq(deseq_kallisto_data)
estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1306 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
deseq_kallisto_lrt <- results(deseq_kallisto_data,name="Response_R_vs_NR",tidy=TRUE)
deseq_kallisto_lrt %>% 
  #filter(padj < 0.05) %>%
  arrange(padj) %>%
  select(row,log2FoldChange,pvalue,padj) %>%
  left_join(select(t2g,ensembl_gene_id,external_gene_name),by=c("row"="ensembl_gene_id"))

DESeq (FeatureCounts)

fc_mat <- read_tsv("../work/input/GSE126044_counts.txt")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  X1 = col_character(),
  Dis_01 = col_double(),
  Dis_02 = col_double(),
  Dis_11 = col_double(),
  Dis_12 = col_double(),
  Dis_15 = col_double(),
  Dis_04 = col_double(),
  Dis_16 = col_double(),
  Dis_17 = col_double(),
  Dis_03 = col_double(),
  Dis_07 = col_double(),
  Dis_10 = col_double(),
  Dis_18 = col_double(),
  Dis_09 = col_double(),
  Dis_05 = col_double(),
  Dis_06 = col_double(),
  Dis_08 = col_double()
)
 fc_mat <- fc_mat[!grepl("^\\d+\\-",fc_mat$X1),] %>%
   column_to_rownames("X1")
 
fc_mat_filt <-  fc_mat %>%
   select(one_of(samples_filt$`Patient ID`))
deseq_fc <- DESeqDataSetFromMatrix(countData=fc_mat_filt,colData=samples_filt,design=~Response)
converting counts to integer mode
some variables in design formula are characters, converting to factors
fc_keep <- rowSums(counts(deseq_fc)>5)>5
deseq_fc <- deseq_fc[fc_keep,]

QC

deseq_fc_norm <- vst(deseq_fc)
meanSdPlot(assay(deseq_fc_norm),rank=FALSE,bins=100)

PCA

plotPCA(deseq_fc_norm,intgroup="Response",ntop=500000) +
  labs(title="PCA Plot of DESeq Normalized Feature Counts/STAR Quantified Data",
       color="Response") +
  scale_color_manual(values=RESPONSE_COLORS)

Difex

Merged Analyses

counts_to_rpm, pca

tfc <- t(fc_mat_filt)
# Convert Kallisto counts to rpm
kalread<-cbind(kallisto_mapped_reads=samples_filt$kallisto_mapped_reads, sleuth_rna_norm %>% rownames_to_column("sample"))
kalrpm<-setDT(kalread) [, lapply(.SD, function(x) (x * 1e6) / (kallisto_mapped_reads)),  by=.(kallisto_mapped_reads,sample)] %>% 
  column_to_rownames("sample")

# Convert star fc to rpm
#get df with est total reads for star
staread<-cbind(star_est_mapped=colSums(fc_mat_filt), as.data.frame(tfc) %>% rownames_to_column("sample"))
starpm<-setDT(staread) [, lapply(.SD, function(x) (x * 1e6) / (star_est_mapped)),  by=.(star_est_mapped,sample)] %>%
  column_to_rownames("sample")

## scale pca with rpm
kal_rpm_pca <- prcomp(kalrpm[,-1],center=TRUE,scale.=TRUE)

starnorm_std <- scale(starpm[,-1])
#star counts include some zero, set scale/std apply to those !=0
star_rpm_pca <- prcomp(starnorm_std[ , which(apply(starnorm_std, 2, var) != 0)],center=TRUE)

## superimposed pca plots from the two sources
p1 <-kal_rpm_pca$x[,1:2]%>%
  as.data.frame() %>%
  rownames_to_column("SRR") %>%
  left_join(samples_filt,by="SRR")
p2<-star_rpm_pca$x[,1:2] %>%
  as.data.frame() %>%
  rownames_to_column("Patient ID") %>%
  left_join(samples_filt,by="Patient ID")
bind_rows(Kallisto=p1,STAR=p2,.id="quant_type") %>%
  ggplot(aes(x=PC1,y=PC2,color=Response,shape=quant_type)) +
    geom_point(size=3) +
    scale_color_manual(values=RESPONSE_COLORS) + 
    labs(title="Superimposed PCA of RPM Normalized Counts",
         shape="Method")

Comparison of log2FC

equation = function(x) {
  lm_coef <- list(a = round(as.numeric(coef(x)[1]), digits = 2),
                  b = round(as.numeric(coef(x)[2]), digits = 2),
                  r2 = round(summary(x)$r.squared, digits = 2));
  lm_eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2,lm_coef)
  as.character(as.expression(lm_eq));                 
}

log2fc_combined <- sleuth_diffex_wald %>%
  filter(!is.na(b)) %>%
  select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
  left_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
  left_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row")) %>%
  filter(!is.na(deseq_fc_log2FC))

regression.line <- lm(deseq_kal_log2FC ~ sleuth_log2FC, data = log2fc_combined)
log2fc_combined %>%
  ggplot(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
  geom_point(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
  geom_hline(yintercept=0, color="darkgray") + 
  geom_vline(xintercept=0, color="darkgray") +
  theme_pubr(base_size = 14)+
  geom_smooth(method = "lm", color="red") +
  annotate("text", label = equation(regression.line), x = -3, y = 4, parse = TRUE)+
  labs(title="Comparison of log2FC between Sleuth and DESeq Kallisto",
       x="log2 Fold Change (Sleuth-Kallisto)",y="log2 Fold Change (DESeq-Kallisto)")



regression.line <- lm(deseq_fc_log2FC ~ deseq_kal_log2FC, data = log2fc_combined)
log2fc_combined %>%
  ggplot(aes(x=deseq_kal_log2FC,y=deseq_fc_log2FC)) +
  geom_point() +
  geom_hline(yintercept=0, color="darkgray") + 
  geom_vline(xintercept=0, color="darkgray") +
  theme_pubr(base_size = 14)+
  geom_smooth(method = "lm", color="red") +
  annotate("text", label = equation(regression.line), x = -4, y = 5, parse = TRUE)+
  labs(title="Comparison of log2FC between DESeq From Kallisto and FC",
       x="log2 Fold Change (DESeq-FeatureCounts)",y="log2 Fold Change (DESeq-Kallisto)")

combine all logFC data

sleuth_diffex_wald <- sleuth_diffex_wald %>% filter(!is.na(b)) 
deseq_fc_lrt <- deseq_fc_lrt %>% filter(!is.na(log2FoldChange)) 
deseq_kallisto_lrt <- deseq_kallisto_lrt %>% filter(!is.na(log2FoldChange)) 

log2fc_full_combined <- sleuth_diffex_wald %>%
  select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
  full_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
  full_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row"))
log2fc_full_combined
NA

Volcano Plots

Slueth


log2fc_full_combined$log2FoldChange <- log2fc_full_combined$sleuth_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$sleuth_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -3))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 3))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 27
table(keyvals.color)
keyvals.color
    green       red royalblue 
    16210      1794      1338 
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - sleuth',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,1.5),
                selectLab = log2fc_final$gene[up.down.genes]
                )

deseq_kal


log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_kal_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_kal_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -20))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 5))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 6
table(keyvals.color)
keyvals.color
    green       red royalblue 
    16311      2059      2369 
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - Deseq-kallisto',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,10),
                selectLab = log2fc_final$gene[up.down.genes]
                )

deseq_fc

log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_fc_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_fc_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -8))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 4))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)
[1] 9
table(keyvals.color)
keyvals.color
    green       red royalblue 
    12879      1621      1650 
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - Deseq-fc',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,15),
                selectLab = log2fc_final$gene[up.down.genes]
                )

Venn diagram

thres <- 1
# get the name of the upregulated/downregulated genes from 3 different methods
sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
deseq.k.up <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange >= thres))]
deseq.k.down <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange <= -thres))]
deseq.fc.up <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange >= thres))]
deseq.fc.down <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange <= -thres))]

# up-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.up),
                            "DESeq-Kallisto" = (deseq.k.up),
                            "DESeq-FeatureCounts" = (deseq.fc.up)),
                    main="Venn Diagram of Upregulated Genes Between 3 Methods",
                   fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)


# down-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.down),
                            "DESeq-Kallisto" = (deseq.k.down),
                            "DESeq-FeatureCounts" = (deseq.fc.down)),
                   main="Venn Diagram of Downregulated Genes Between 3 Methods",
                   fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)

Heatmap


#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]

normalized_counts <- counts(deseq_kallisto_data, normalized=TRUE)
colnames(normalized_counts)
 [1] "Dis_01" "Dis_02" "Dis_03" "Dis_04" "Dis_05" "Dis_06" "Dis_07" "Dis_08" "Dis_09" "Dis_11"
[11] "Dis_12" "Dis_15" "Dis_16" "Dis_17" "Dis_18"
updown.gene <- deseq_kallisto_lrt[which((deseq_kallisto_lrt$log2FoldChange > 1 | deseq_kallisto_lrt$log2FoldChange < -1) & deseq_kallisto_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")

updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])

heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]

zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)

col_updown = c("up" = "pink", "down" = "purple")

row.names(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
samples_filt

ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",  
                       col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)
[1] 20739    15
ht1 = Heatmap(zscore_data, name = "expression",
        col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
        show_column_names = FALSE, show_row_names=FALSE,
        row_title = NULL,
        bottom_annotation = ha,
        heatmap_width = 400, heatmap_height = 100)

ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)


#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
normalized_counts <- counts(deseq_fc, normalized=TRUE)

updown.gene <- deseq_fc_lrt[which((deseq_fc_lrt$log2FoldChange > 1 | deseq_fc_lrt$log2FoldChange < -1) & deseq_fc_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")

updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])

heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]

zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)

col_updown = c("up" = "pink", "down" = "purple")

row.names(samples_filt) <- samples_filt$`Patient ID`
Setting row names on a tibble is deprecated.
samples_filt

ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",  
                       col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)
[1] 16150    15
ht1 = Heatmap(zscore_data, name = "expression",
        col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
        show_column_names = FALSE, show_row_names=FALSE,
        row_title = NULL,
        bottom_annotation = ha,
        heatmap_width = 400, heatmap_height = 100)

ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)

Sample overview table

sample_summary <- data.frame(fc_mapped_reads=colSums(fc_mat)) %>% 
  rownames_to_column("Patient ID") %>% 
  left_join(samples,by="Patient ID") %>%
  select(`Patient ID`,SRR,Response,"Sequenced Reads"=n_reads,"Mapped Reads(Kallisto)"=kallisto_mapped_reads,"Mapped Reads(STAR/FeatureCounts)"=fc_mapped_reads)
sample_summary

Write out relevant files

sleuth_diffex_lrt %>% write_tsv("sleuth_diffex_lrt.tsv")
sleuth_diffex_wald %>% write_tsv("sleuth_diffex_wald.tsv")
deseq_kallisto_lrt %>% write_tsv("deseq_kallisto_lrt.tsv")
deseq_fc_lrt %>% write_tsv("deseq_fc_lrt.tsv")
sample_summary%>% write_tsv("sample_summary.tsv")
---
title: "Comparisons of Differential Expression Tools on Different Data Sources"
output: html_notebook
---

# Setup
```{r}
library(biomaRt)
library(DESeq2)
library(sleuth)
library(tximport)
library(jsonlite)
library(data.table)
library(vsn)
library(EnhancedVolcano)
library(viridis)
library(ggpubr)
library(MASS)
library(rtracklayer)
library(VennDiagram)
library(RColorBrewer)
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
library(tidyverse)
set.seed(42)
RESPONSE_COLORS = c("NR"="purple","R"="orange")
```

## Setup sample details
```{r}
# Load in samples and point to kallisto paths
samples <- read_tsv("../work/input/sample_manifest.tsv")
samples$kallisto_path <- file.path("..","work","intermediate","quant",samples$SRR)
```

## Transcript <-> Gene Mappings
```{r}
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = 'grch37.ensembl.org')
gene_features <- biomaRt::getBM(attributes = c("ensembl_transcript_id_version", "ensembl_gene_id",
    "external_gene_name","transcript_length"), mart = mart)
t2g <- gene_features %>%
  select(ensembl_transcript_id_version,ensembl_gene_id,external_gene_name)
avg_tx_len <-gene_features %>%
  group_by(external_gene_name) %>%
  summarise(length=mean(transcript_length))
```

## QC Filtering
We'll filter high quality samples by percent reads mapped by Kallisto. We'll need to get hte total reads in files from the kallisto logs, and the total number of mapped reads by pre-loading all the counts using sleuth. 
```{r}
# Find total reads to compute percent mapped
samples$n_reads <- apply(samples,1,function(x)read_json(file.path(x["kallisto_path"],"run_info.json"))$n_processed)
```

```{r}
# Initial sleuth just to get kallisto counts
s_obj_counts <- sleuth_prep(
  sample_to_covariates = rename(samples,path=kallisto_path,sample=SRR),
  normalize=FALSE
)
```

```{r}
kallisto_est_mapped <- s_obj_counts$obs_raw %>%
  group_by(sample) %>%
  summarise(kallisto_mapped_reads=sum(est_counts))
samples <- samples %>% 
  left_join(kallisto_est_mapped,by=c("SRR"="sample")) %>%
  mutate(percent_mapped=kallisto_mapped_reads/n_reads) 
samples %>%
  ggplot(aes(x = percent_mapped)) +
    geom_histogram(bins = 10) +
    labs(title = "Distribution of % Mapped Reads", x = "% Mapped Reads")
```

```{r}
# Filter samples
samples_filt <-samples %>% 
  filter(
    !SRR %in% c("SRR8526726"),
    #percent_mapped > 0.8
    )
```



# Sleuth
```{r include=FALSE}
# Setup sleuth object
s_obj <- sleuth_prep(
  sample_to_covariates = rename(samples_filt,path=kallisto_path,sample=SRR),
  full_model = ~Response,
  transform_fun_counts=function(x)log2(x+1),
  target_mapping = rename(t2g,target_id=ensembl_transcript_id_version) %>%
      distinct(target_id,external_gene_name),
  aggregation_column = "external_gene_name",
  gene_mode=TRUE
)
```

## PCA 

### Sleuth default
```{r}
plot_pca(s_obj,units="scaled_reads_per_base",color_by = "Response",use_filtered = TRUE,text_labels = F) + geom_label(aes(label=sample),size=2)
```

### Standardized PCA
```{r}

sleuth_rna_norm <- s_obj$obs_norm_filt %>% 
  select(sample,target_id,scaled_reads_per_base) %>%
  spread(key=target_id,value=scaled_reads_per_base) %>%
  column_to_rownames("sample")
sleuth_std_pca <- prcomp((sleuth_rna_norm),center=T,scale.=T)
sleuth_variance_explained = summary(sleuth_std_pca)$importance[2,1:2]
sleuth_std_pca$x[,1:2] %>%
  as.data.frame() %>%
  rownames_to_column("SRR") %>%
  left_join(samples,by="SRR") %>%
  ggplot(aes(x=PC1,y=PC2,color=Response,label=SRR)) +
    geom_point(size=3) +
    labs(title="PCA Plot of Sleuth Normalized Kallisto Quantified Data",
         x = paste0("PC1: ", round(sleuth_variance_explained[1]*100),"% Variance"),
         y = paste0("PC2: ", round(sleuth_variance_explained[2]*100),"% Variance")) +
    scale_color_manual(values=RESPONSE_COLORS)
```

## Diffex

### LRT
```{r}
s_obj <- s_obj %>%
  sleuth_fit(~Response,"full") %>%
  sleuth_fit(~1,"reduced") %>%
  sleuth_lrt("reduced","full")
```
```{r}
sleuth_diffex_lrt <- sleuth_results(s_obj,test="reduced:full",test_type="lrt") %>% filter(!is.na(pval))
sleuth_diffex_lrt %>% arrange(qval)
```

### Wald
```{r}
s_obj <- sleuth_wt(s_obj,"ResponseR")
sleuth_diffex_wald <- sleuth_results(s_obj,test="ResponseR",test_type="wald")  %>% filter(!is.na(pval))
sleuth_diffex_wald %>% arrange(qval) 
```
```{r}
plot_volcano(s_obj,test_type="wt",test="ResponseR",sig_level=0.05) +
  labs(title="Volcano Plot of Sleuth Identified Differentially Expressed Genes",
       x="log2FC", y = "qval",
        color="Q < 0.05")
```

## QC
```{r}
meanSdPlot(log2(sleuth_to_matrix(s_obj,"obs_norm","scaled_reads_per_base") + 1),rank=FALSE,bins=100)
```

# DESeq (Kallisto)
```{r}
files <- file.path(samples_filt$kallisto_path,"abundance.h5")
names(files) <- samples_filt$`Patient ID`
txi <- tximport(
  file.path(samples_filt$kallisto_path,"abundance.h5"),
  type = "kallisto",
  tx2gene = select(
    t2g,
    TXNAME = ensembl_transcript_id_version,
    GENEID = external_gene_name
  ),
  ignoreAfterBar = TRUE,
)
rownames(samples_filt) <- samples_filt$`Patient ID`
deseq_kallisto_data <- DESeqDataSetFromTximport(txi,samples_filt,~Response)
kallisto_keep <- rowSums(counts(deseq_kallisto_data)>5)>5
deseq_kallisto_data <- deseq_kallisto_data[kallisto_keep,]
```

## QC
```{r}
deseq_kallisto_norm <- vst(deseq_kallisto_data)
meanSdPlot(assay(deseq_kallisto_norm),ranks=FALSE,bins=100)

```


## PCA
```{r}
plotPCA(deseq_kallisto_norm,intgroup="Response",ntop=500000)   +
  labs(title="PCA Plot of DESeq Normalized Kallisto Quantified Data",
       color="Response") +
  scale_color_manual(values=RESPONSE_COLORS)
```
## Difex
```{r}
deseq_kallisto_data <- DESeq(deseq_kallisto_data)
deseq_kallisto_lrt <- results(deseq_kallisto_data,name="Response_R_vs_NR",tidy=TRUE)
deseq_kallisto_lrt %>% 
  #filter(padj < 0.05) %>%
  arrange(padj) %>%
  select(row,log2FoldChange,pvalue,padj) %>%
  left_join(select(t2g,ensembl_gene_id,external_gene_name),by=c("row"="ensembl_gene_id"))
```

# DESeq (FeatureCounts)
```{r}
fc_mat <- read_tsv("../work/input/GSE126044_counts.txt")
 fc_mat <- fc_mat[!grepl("^\\d+\\-",fc_mat$X1),] %>%
   column_to_rownames("X1")
 
fc_mat_filt <-  fc_mat %>%
   select(one_of(samples_filt$`Patient ID`))
deseq_fc <- DESeqDataSetFromMatrix(countData=fc_mat_filt,colData=samples_filt,design=~Response)
fc_keep <- rowSums(counts(deseq_fc)>5)>5
deseq_fc <- deseq_fc[fc_keep,]
```

## QC
```{r}
deseq_fc_norm <- vst(deseq_fc)
meanSdPlot(assay(deseq_fc_norm),rank=FALSE,bins=100)
```


## PCA
```{r}
plotPCA(deseq_fc_norm,intgroup="Response",ntop=500000) +
  labs(title="PCA Plot of DESeq Normalized Feature Counts/STAR Quantified Data",
       color="Response") +
  scale_color_manual(values=RESPONSE_COLORS)
```

## Difex
```{r}
deseq_fc <- DESeq(deseq_fc)
deseq_fc_lrt <- results(deseq_fc,name="Response_R_vs_NR",tidy=TRUE)
deseq_fc_lrt %>% 
 # filter(padj < 0.05) %>%
  arrange(padj) %>%
  select(row,log2FoldChange,pvalue,padj)
```

# Merged Analyses

## counts_to_rpm, pca
```{r}
tfc <- t(fc_mat_filt)
# Convert Kallisto counts to rpm
kalread<-cbind(kallisto_mapped_reads=samples_filt$kallisto_mapped_reads, sleuth_rna_norm %>% rownames_to_column("sample"))
kalrpm<-setDT(kalread) [, lapply(.SD, function(x) (x * 1e6) / (kallisto_mapped_reads)),  by=.(kallisto_mapped_reads,sample)] %>% 
  column_to_rownames("sample")

# Convert star fc to rpm
#get df with est total reads for star
staread<-cbind(star_est_mapped=colSums(fc_mat_filt), as.data.frame(tfc) %>% rownames_to_column("sample"))
starpm<-setDT(staread) [, lapply(.SD, function(x) (x * 1e6) / (star_est_mapped)),  by=.(star_est_mapped,sample)] %>%
  column_to_rownames("sample")

## scale pca with rpm
kal_rpm_pca <- prcomp(kalrpm[,-1],center=TRUE,scale.=TRUE)

starnorm_std <- scale(starpm[,-1])
#star counts include some zero, set scale/std apply to those !=0
star_rpm_pca <- prcomp(starnorm_std[ , which(apply(starnorm_std, 2, var) != 0)],center=TRUE)

## superimposed pca plots from the two sources
p1 <-kal_rpm_pca$x[,1:2]%>%
  as.data.frame() %>%
  rownames_to_column("SRR") %>%
  left_join(samples_filt,by="SRR")
p2<-star_rpm_pca$x[,1:2] %>%
  as.data.frame() %>%
  rownames_to_column("Patient ID") %>%
  left_join(samples_filt,by="Patient ID")
bind_rows(Kallisto=p1,STAR=p2,.id="quant_type") %>%
  ggplot(aes(x=PC1,y=PC2,color=Response,shape=quant_type)) +
    geom_point(size=3) +
    scale_color_manual(values=RESPONSE_COLORS) + 
    labs(title="Superimposed PCA of RPM Normalized Counts",
         shape="Method")
```



## Comparison of log2FC
```{r}
equation = function(x) {
  lm_coef <- list(a = round(as.numeric(coef(x)[1]), digits = 2),
                  b = round(as.numeric(coef(x)[2]), digits = 2),
                  r2 = round(summary(x)$r.squared, digits = 2));
  lm_eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2,lm_coef)
  as.character(as.expression(lm_eq));                 
}

log2fc_combined <- sleuth_diffex_wald %>%
  filter(!is.na(b)) %>%
  select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
  left_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
  left_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row")) %>%
  filter(!is.na(deseq_fc_log2FC))

regression.line <- lm(deseq_kal_log2FC ~ sleuth_log2FC, data = log2fc_combined)
log2fc_combined %>%
  ggplot(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
  geom_point(aes(x=sleuth_log2FC,y=deseq_kal_log2FC)) +
  geom_hline(yintercept=0, color="darkgray") + 
  geom_vline(xintercept=0, color="darkgray") +
  theme_pubr(base_size = 14)+
  geom_smooth(method = "lm", color="red") +
  annotate("text", label = equation(regression.line), x = -3, y = 4, parse = TRUE)+
  labs(title="Comparison of log2FC between Sleuth and DESeq Kallisto",
       x="log2 Fold Change (Sleuth-Kallisto)",y="log2 Fold Change (DESeq-Kallisto)")


regression.line <- lm(deseq_fc_log2FC ~ deseq_kal_log2FC, data = log2fc_combined)
log2fc_combined %>%
  ggplot(aes(x=deseq_kal_log2FC,y=deseq_fc_log2FC)) +
  geom_point() +
  geom_hline(yintercept=0, color="darkgray") + 
  geom_vline(xintercept=0, color="darkgray") +
  theme_pubr(base_size = 14)+
  geom_smooth(method = "lm", color="red") +
  annotate("text", label = equation(regression.line), x = -4, y = 5, parse = TRUE)+
  labs(title="Comparison of log2FC between DESeq From Kallisto and FC",
       x="log2 Fold Change (DESeq-FeatureCounts)",y="log2 Fold Change (DESeq-Kallisto)")

```



## combine all logFC data
```{r}
sleuth_diffex_wald <- sleuth_diffex_wald %>% filter(!is.na(b)) 
deseq_fc_lrt <- deseq_fc_lrt %>% filter(!is.na(log2FoldChange)) 
deseq_kallisto_lrt <- deseq_kallisto_lrt %>% filter(!is.na(log2FoldChange)) 

log2fc_full_combined <- sleuth_diffex_wald %>%
  select(gene=target_id,sleuth_log2FC=b,sleuth_q=qval) %>%
  full_join(select(deseq_kallisto_lrt,row,deseq_kal_log2FC=log2FoldChange,deseq_kal_q=padj),by=c("gene"="row")) %>%
  full_join(select(deseq_fc_lrt,row,deseq_fc_log2FC=log2FoldChange,deseq_fc_q=padj),by=c("gene"="row"))
log2fc_full_combined

```

## Volcano Plots

### Slueth
```{r}

log2fc_full_combined$log2FoldChange <- log2fc_full_combined$sleuth_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$sleuth_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -3))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 3))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)

table(keyvals.color)
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - sleuth',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,1.5),
                selectLab = log2fc_final$gene[up.down.genes]
                )

```

### deseq_kal
```{r}

log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_kal_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_kal_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -20))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 5))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)

table(keyvals.color)
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - Deseq-kallisto',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,10),
                selectLab = log2fc_final$gene[up.down.genes]
                )

```


### deseq_fc
```{r}
log2fc_full_combined$log2FoldChange <- log2fc_full_combined$deseq_fc_log2FC
log2fc_full_combined$padj <- log2fc_full_combined$deseq_fc_q
log2fc_final <- log2fc_full_combined %>% filter(!is.na(log2FoldChange)) 

thres <- 1
keyvals.color <- rep('green', nrow(log2fc_final))
# set the base name/label as 'Mid'
names(keyvals.color) <- rep('Mid', nrow(log2fc_final))

keyvals.color[which(log2fc_final$log2FoldChange > thres)] <- 'red'
names(keyvals.color)[which(log2fc_final$log2FoldChange > thres)] <- 'UpRegulated'

keyvals.color[which(log2fc_final$log2FoldChange < -thres)] <- 'royalblue'
names(keyvals.color)[which(log2fc_final$log2FoldChange < -thres)] <- 'DownRegulated'

down.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange <= -8))
up.genes  <- which((log2fc_final$padj < 0.3) & (log2fc_final$log2FoldChange >= 4))
up.down.genes = c(down.genes, up.genes)
length(up.down.genes)

table(keyvals.color)
EnhancedVolcano(log2fc_final,
                lab = (log2fc_final$gene),
                x = 'log2FoldChange',
                y = 'padj', 
                title = 'R vs. NR  - Deseq-fc',
                drawConnectors = TRUE,
                widthConnectors = 0.3,
                colConnectors = 'grey30',
                colCustom = keyvals.color,
                pCutoff = NA,
                cutoffLineType = 'blank',
                vline = c(-thres,thres),
                vlineCol = c('grey50', 'grey50'),
                gridlines.major = FALSE,
                gridlines.minor = FALSE,
                ylim = c(0,15),
                selectLab = log2fc_final$gene[up.down.genes]
                )

```


## Venn diagram
```{r}
thres <- 1
# get the name of the upregulated/downregulated genes from 3 different methods
sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
deseq.k.up <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange >= thres))]
deseq.k.down <- deseq_kallisto_lrt$row[which((deseq_kallisto_lrt$padj <= 0.05) & (deseq_kallisto_lrt$log2FoldChange <= -thres))]
deseq.fc.up <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange >= thres))]
deseq.fc.down <- deseq_fc_lrt$row[which((deseq_fc_lrt$padj <= 0.05) & (deseq_fc_lrt$log2FoldChange <= -thres))]

# up-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.up),
                            "DESeq-Kallisto" = (deseq.k.up),
                            "DESeq-FeatureCounts" = (deseq.fc.up)),
                    main="Venn Diagram of Upregulated Genes Between 3 Methods",
                   fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)

# down-regulated venn-diagram
grid.newpage()
vd <- venn.diagram(x = list("Sleuth-Kallisto" = (sleuth.down),
                            "DESeq-Kallisto" = (deseq.k.down),
                            "DESeq-FeatureCounts" = (deseq.fc.down)),
                   main="Venn Diagram of Downregulated Genes Between 3 Methods",
                   fill = brewer.pal(4, "Set2")[1:3], filename = NULL)
grid.draw(vd)

```


## Heatmap
```{r}

#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]

normalized_counts <- counts(deseq_kallisto_data, normalized=TRUE)
colnames(normalized_counts)


updown.gene <- deseq_kallisto_lrt[which((deseq_kallisto_lrt$log2FoldChange > 1 | deseq_kallisto_lrt$log2FoldChange < -1) & deseq_kallisto_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")

updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])

heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]

zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)

col_updown = c("up" = "pink", "down" = "purple")

row.names(samples_filt) <- samples_filt$`Patient ID`
samples_filt

ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",  
                       col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)

ht1 = Heatmap(zscore_data, name = "expression",
        col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
        show_column_names = FALSE, show_row_names=FALSE,
        row_title = NULL,
        bottom_annotation = ha,
        heatmap_width = 400, heatmap_height = 100)

ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)
```

```{r}

#sleuth.up <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b >= thres))]
#sleuth.down <- sleuth_diffex_wald$target_id[which((sleuth_diffex_wald$qval <= 0.05) & (sleuth_diffex_wald$b <= -thres))]
normalized_counts <- counts(deseq_fc, normalized=TRUE)

updown.gene <- deseq_fc_lrt[which((deseq_fc_lrt$log2FoldChange > 1 | deseq_fc_lrt$log2FoldChange < -1) & deseq_fc_lrt$padj < 0.05),]
updown.gene$updown <- ifelse((updown.gene$log2FoldChange> 1), "up", "down")

updown.regul <- as.matrix(updown.gene[,"updown", drop=FALSE])

heatmapdata <- normalized_counts[match(updown.gene$row, row.names(normalized_counts)),]

zscore_data <- t(apply(heatmapdata, 1, scale))
colnames(zscore_data) = colnames(heatmapdata)

col_updown = c("up" = "pink", "down" = "purple")

row.names(samples_filt) <- samples_filt$`Patient ID`
samples_filt

ha = HeatmapAnnotation(type = samples_filt$Response, annotation_name_side = "left",  
                       col = list(type = c("NR" = "yellow", "R" = "black")))
dim(normalized_counts)

ht1 = Heatmap(zscore_data, name = "expression",
        col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
        show_column_names = FALSE, show_row_names=FALSE,
        row_title = NULL,
        bottom_annotation = ha,
        heatmap_width = 400, heatmap_height = 100)

ht2 = Heatmap(updown.regul, name = "up vs down", col = col_updown, show_row_names=FALSE)
ht_list = ht1 + ht2
draw(ht_list, row_km = 1, row_split = updown.regul, cluster_rows = TRUE)
```

# Sample overview table
```{r}
sample_summary <- data.frame(fc_mapped_reads=colSums(fc_mat)) %>% 
  rownames_to_column("Patient ID") %>% 
  left_join(samples,by="Patient ID") %>%
  select(`Patient ID`,SRR,Response,"Sequenced Reads"=n_reads,"Mapped Reads(Kallisto)"=kallisto_mapped_reads,"Mapped Reads(STAR/FeatureCounts)"=fc_mapped_reads)
sample_summary
```


# Write out relevant files
```{r}
sleuth_diffex_lrt %>% write_tsv("sleuth_diffex_lrt.tsv")
sleuth_diffex_wald %>% write_tsv("sleuth_diffex_wald.tsv")
deseq_kallisto_lrt %>% write_tsv("deseq_kallisto_lrt.tsv")
deseq_fc_lrt %>% write_tsv("deseq_fc_lrt.tsv")
sample_summary%>% write_tsv("sample_summary.tsv")
```
